{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 程序功能：将kddcup99数据集特征或标签中的字符转换为数值表示"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入处理数据所需的库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：将相应的非数字类型转换为数字标识，即字符型数据转化为数值型数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取字符对应的索引表示该字符\n",
    "def find_index(x,y):\n",
    "    return [i for i in range(len(y)) if y[i]==x]   # Python列表解析,返回列表"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：将原数据集中3种协议类型转换成数字标识"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def handleProtocol(inputs):\n",
    "    protocol_list=['tcp','udp','icmp']\n",
    "    if inputs[1] in protocol_list:\n",
    "        return find_index(inputs[1], protocol_list)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：将原数据集中70种网络服务类型转换成数字标识"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def handleService(inputs):\n",
    "    service_list=['aol','auth','bgp','courier','csnet_ns','ctf','daytime','discard','domain','domain_u',\n",
    "                 'echo','eco_i','ecr_i','efs','exec','finger','ftp','ftp_data','gopher','harvest','hostnames',\n",
    "                 'http','http_2784','http_443','http_8001','imap4','IRC','iso_tsap','klogin','kshell','ldap',\n",
    "                 'link','login','mtp','name','netbios_dgm','netbios_ns','netbios_ssn','netstat','nnsp','nntp',\n",
    "                 'ntp_u','other','pm_dump','pop_2','pop_3','printer','private','red_i','remote_job','rje','shell',\n",
    "                 'smtp','sql_net','ssh','sunrpc','supdup','systat','telnet','tftp_u','tim_i','time','urh_i','urp_i',\n",
    "                 'uucp','uucp_path','vmnet','whois','X11','Z39_50']\n",
    "    if inputs[2] in service_list:\n",
    "        return find_index(inputs[2],service_list)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：将原数据集中11种网络连接状态转换成数字标识"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def handleFlag(inputs):\n",
    "    flag_list=['OTH','REJ','RSTO','RSTOS0','RSTR','S0','S1','S2','S3','SF','SH']\n",
    "    if inputs[3] in flag_list:\n",
    "        return find_index(inputs[3],flag_list)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：将原数据集中攻击类型转换成数字标识\n",
    "异常类型被细分为4大类共39种攻击类型，训练集中共出现22种攻击类型，而剩下的17种未知攻击类型出现在测试集中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "global label_list  # 定义label_list为全局变量，用于存放39种攻击类型\n",
    "def handleLabel(inputs):\n",
    "    label_list=['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', 'smurf.',\n",
    "                'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.', 'ipsweep.', 'land.', 'ftp_write.',\n",
    "                'back.', 'imap.', 'satan.', 'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',\n",
    "                'spy.', 'rootkit.']\n",
    "    # 在函数内部使用全局变量并修改它\n",
    "    if inputs[41] in label_list:\n",
    "        return find_index(inputs[41],label_list)[0]\n",
    "    else:\n",
    "        label_list.append(inputs[41])                # 如果发现出现新的攻击类型，将它添加到label_list\n",
    "        return find_index(inputs[41],label_list)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 主函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数值化done！\n"
     ]
    }
   ],
   "source": [
    "# 文件写入\n",
    "data_numerization = open(\"kddcup.data.numerization00.txt\", 'w', newline='')  # 新建文件用于存放数值化后的数据集\n",
    "if __name__=='__main__':\n",
    "    with open('kddcup.data.txt','r') as data_original:                       # 打开原始数据集文件\n",
    "        csv_reader = csv.reader(data_original)                               # 按行读取所有数据并返回由csv文件的每行组成的列表\n",
    "        csv_writer = csv.writer(data_numerization, dialect='excel')          # 先传入文件句柄\n",
    "        for row in csv_reader:                       # 循环读取数据\n",
    "            temp_line=np.array(row)                  # 将列表list转换为ndarray数组。                  \n",
    "            temp_line[1] = handleProtocol(row)       # 将源文件行中3种协议类型转换成数字标识\n",
    "            temp_line[2] = handleService(row)        # 将源文件行中70种网络服务类型转换成数字标识\n",
    "            temp_line[3] = handleFlag(row)           # 将源文件行中11种网络连接状态转换成数字标识\n",
    "            temp_line[41] = handleLabel(row)         # 将源文件行中23种攻击类型转换成数字标识\n",
    "            csv_writer.writerow(temp_line)           # 按行写入\n",
    "        data_numerization.close()\n",
    "        print('数值化done！')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
